library(readxl)
library(ggplot2)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(naivebayes)
## naivebayes 0.9.7 loaded
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.1 ✓ purrr 0.3.4
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x randomForest::combine() masks dplyr::combine()
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
## x purrr::set_names() masks magrittr::set_names()
library(caret)
library(SuperLearner)
## Loading required package: nnls
## Super Learner
## Version: 2.0-26
## Package created on 2019-10-27
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(class)
library(corrplot)
## corrplot 0.84 loaded
library(GoodmanKruskal)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ROSE)
## Loaded ROSE 0.0-3
# Importing all the case study 2 data set from github
raw_dataset="https://raw.githubusercontent.com/lijjumathew/MSDS-Doing-Datascience-Project2/master/Data/CaseStudy2-data.csv"
no_attrition_dataset="https://raw.githubusercontent.com/lijjumathew/MSDS-Doing-Datascience-Project2/master/Data/CaseStudy2CompSet%20No%20Attrition.csv"
no_salary_dataset="https://raw.githubusercontent.com/lijjumathew/MSDS-Doing-Datascience-Project2/master/Data/CaseStudy2CompSet%20No%20Salary.csv"
raw_data <- read.csv(raw_dataset, sep = ",", header = TRUE)
no_attrition_data <- read.csv(no_attrition_dataset, sep = ",", header = TRUE)
no_salary_data <- read.csv(no_salary_dataset, sep = ",", header = TRUE)
# Checking details of case study 2 dataset
dim(raw_data)
## [1] 870 36
names(raw_data)
## [1] "ID" "Age"
## [3] "Attrition" "BusinessTravel"
## [5] "DailyRate" "Department"
## [7] "DistanceFromHome" "Education"
## [9] "EducationField" "EmployeeCount"
## [11] "EmployeeNumber" "EnvironmentSatisfaction"
## [13] "Gender" "HourlyRate"
## [15] "JobInvolvement" "JobLevel"
## [17] "JobRole" "JobSatisfaction"
## [19] "MaritalStatus" "MonthlyIncome"
## [21] "MonthlyRate" "NumCompaniesWorked"
## [23] "Over18" "OverTime"
## [25] "PercentSalaryHike" "PerformanceRating"
## [27] "RelationshipSatisfaction" "StandardHours"
## [29] "StockOptionLevel" "TotalWorkingYears"
## [31] "TrainingTimesLastYear" "WorkLifeBalance"
## [33] "YearsAtCompany" "YearsInCurrentRole"
## [35] "YearsSinceLastPromotion" "YearsWithCurrManager"
head(raw_data)
## ID Age Attrition BusinessTravel DailyRate Department
## 1 1 32 No Travel_Rarely 117 Sales
## 2 2 40 No Travel_Rarely 1308 Research & Development
## 3 3 35 No Travel_Frequently 200 Research & Development
## 4 4 32 No Travel_Rarely 801 Sales
## 5 5 24 No Travel_Frequently 567 Research & Development
## 6 6 27 No Travel_Frequently 294 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 13 4 Life Sciences 1 859
## 2 14 3 Medical 1 1128
## 3 18 2 Life Sciences 1 1412
## 4 1 4 Marketing 1 2016
## 5 2 1 Technical Degree 1 1646
## 6 10 2 Life Sciences 1 733
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 2 Male 73 3 2
## 2 3 Male 44 2 5
## 3 3 Male 60 3 3
## 4 3 Female 48 3 3
## 5 1 Female 32 3 1
## 6 4 Male 32 3 3
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1 Sales Executive 4 Divorced 4403
## 2 Research Director 3 Single 19626
## 3 Manufacturing Director 4 Single 9362
## 4 Sales Executive 4 Married 10422
## 5 Research Scientist 4 Single 3760
## 6 Manufacturing Director 1 Divorced 8793
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1 9250 2 Y No 11
## 2 17544 1 Y No 14
## 3 19944 2 Y No 11
## 4 24032 1 Y No 19
## 5 17218 1 Y Yes 13
## 6 4809 1 Y No 21
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1 3 3 80 1
## 2 3 1 80 0
## 3 3 3 80 0
## 4 3 3 80 2
## 5 3 3 80 0
## 6 4 3 80 2
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 3 2 5
## 2 21 2 4 20
## 3 10 2 3 2
## 4 14 3 3 14
## 5 6 2 3 6
## 6 9 4 2 9
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1 2 0 3
## 2 7 4 9
## 3 2 2 2
## 4 10 5 7
## 5 3 1 3
## 6 7 1 7
#View(raw_data)
# Get a basic idea of all the variables.
summary(raw_data)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 No :730 Non-Travel : 94
## 1st Qu.:218.2 1st Qu.:30.00 Yes:140 Travel_Frequently:158
## Median :435.5 Median :35.00 Travel_Rarely :618
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Human Resources : 35 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Research & Development:562 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Sales :273 Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Human Resources : 15 Min. :1 Min. : 1.0 Min. :1.000
## Life Sciences :358 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Marketing :100 Median :1 Median :1039.0 Median :3.000
## Medical :270 Mean :1 Mean :1029.8 Mean :2.701
## Other : 52 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Technical Degree: 75 Max. :1 Max. :2064.0 Max. :4.000
##
## Gender HourlyRate JobInvolvement JobLevel
## Female:354 Min. : 30.00 Min. :1.000 Min. :1.000
## Male :516 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
##
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Sales Executive :200 Min. :1.000 Divorced:191 Min. : 1081
## Research Scientist :172 1st Qu.:2.000 Married :410 1st Qu.: 2840
## Laboratory Technician :153 Median :3.000 Single :269 Median : 4946
## Manufacturing Director : 87 Mean :2.709 Mean : 6390
## Healthcare Representative: 76 3rd Qu.:4.000 3rd Qu.: 8182
## Sales Representative : 53 Max. :4.000 Max. :19999
## (Other) :129
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. : 2094 Min. :0.000 Y:870 No :618 Min. :11.0
## 1st Qu.: 8092 1st Qu.:1.000 Yes:252 1st Qu.:12.0
## Median :14074 Median :2.000 Median :14.0
## Mean :14326 Mean :2.728 Mean :15.2
## 3rd Qu.:20456 3rd Qu.:4.000 3rd Qu.:18.0
## Max. :26997 Max. :9.000 Max. :25.0
##
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.152 Mean :2.707 Mean :80 Mean :0.7839
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
##
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.05 Mean :2.832 Mean :2.782 Mean : 6.962
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
##
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 3.000 Median : 1.000 Median : 3.00
## Mean : 4.205 Mean : 2.169 Mean : 4.14
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.00
## Max. :18.000 Max. :15.000 Max. :17.00
##
# This gives us a high level view of the case study 2 dataset.
# We can see that we don't have any NULL values in our case study 2 dataset.
# We can also see that we just have 140 attrition records out of 870 which is approximately 16%.
str(raw_data)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
summary(raw_data)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 No :730 Non-Travel : 94
## 1st Qu.:218.2 1st Qu.:30.00 Yes:140 Travel_Frequently:158
## Median :435.5 Median :35.00 Travel_Rarely :618
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Human Resources : 35 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Research & Development:562 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Sales :273 Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Human Resources : 15 Min. :1 Min. : 1.0 Min. :1.000
## Life Sciences :358 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Marketing :100 Median :1 Median :1039.0 Median :3.000
## Medical :270 Mean :1 Mean :1029.8 Mean :2.701
## Other : 52 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Technical Degree: 75 Max. :1 Max. :2064.0 Max. :4.000
##
## Gender HourlyRate JobInvolvement JobLevel
## Female:354 Min. : 30.00 Min. :1.000 Min. :1.000
## Male :516 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
##
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Sales Executive :200 Min. :1.000 Divorced:191 Min. : 1081
## Research Scientist :172 1st Qu.:2.000 Married :410 1st Qu.: 2840
## Laboratory Technician :153 Median :3.000 Single :269 Median : 4946
## Manufacturing Director : 87 Mean :2.709 Mean : 6390
## Healthcare Representative: 76 3rd Qu.:4.000 3rd Qu.: 8182
## Sales Representative : 53 Max. :4.000 Max. :19999
## (Other) :129
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. : 2094 Min. :0.000 Y:870 No :618 Min. :11.0
## 1st Qu.: 8092 1st Qu.:1.000 Yes:252 1st Qu.:12.0
## Median :14074 Median :2.000 Median :14.0
## Mean :14326 Mean :2.728 Mean :15.2
## 3rd Qu.:20456 3rd Qu.:4.000 3rd Qu.:18.0
## Max. :26997 Max. :9.000 Max. :25.0
##
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.152 Mean :2.707 Mean :80 Mean :0.7839
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
##
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.05 Mean :2.832 Mean :2.782 Mean : 6.962
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
##
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 3.000 Median : 1.000 Median : 3.00
## Mean : 4.205 Mean : 2.169 Mean : 4.14
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.00
## Max. :18.000 Max. :15.000 Max. :17.00
##
str(no_attrition_data)
## 'data.frame': 300 obs. of 35 variables:
## $ ID : int 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
## $ Age : int 35 33 26 55 29 51 52 39 31 31 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 2 1 3 3 2 ...
## $ DailyRate : int 750 147 1330 1311 1246 1456 585 1387 1062 534 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 2 1 2 2 3 2 3 2 2 2 ...
## $ DistanceFromHome : int 28 2 21 2 19 1 29 10 24 20 ...
## $ Education : int 3 3 3 3 3 4 4 5 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 1 4 2 2 4 2 4 4 2 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
## $ EnvironmentSatisfaction : int 2 2 1 3 3 1 1 2 3 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 2 2 1 2 ...
## $ HourlyRate : int 46 99 37 97 77 30 40 76 96 66 ...
## $ JobInvolvement : int 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : int 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 3 2 3 4 8 1 9 5 1 1 ...
## $ JobSatisfaction : int 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 1 3 1 2 3 2 ...
## $ MonthlyIncome : int 3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
## $ MonthlyRate : int 25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
## $ NumCompaniesWorked : int 1 1 1 2 1 3 2 2 1 3 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ PercentSalaryHike : int 17 13 20 13 14 20 15 13 19 12 ...
## $ PerformanceRating : int 3 3 4 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 4 4 3 3 3 3 2 4 2 1 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 2 1 1 0 2 0 2 3 0 0 ...
## $ TotalWorkingYears : int 10 5 1 30 10 23 16 10 10 12 ...
## $ TrainingTimesLastYear : int 3 2 0 2 3 1 3 3 2 2 ...
## $ WorkLifeBalance : int 2 3 2 3 3 2 2 3 3 3 ...
## $ YearsAtCompany : int 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : int 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsSinceLastPromotion : int 6 1 0 1 0 12 0 7 1 0 ...
## $ YearsWithCurrManager : int 8 4 0 2 4 8 0 7 8 0 ...
summary(no_attrition_data)
## ID Age BusinessTravel DailyRate
## Min. :1171 Min. :19.00 Non-Travel : 32 Min. : 102.0
## 1st Qu.:1246 1st Qu.:31.00 Travel_Frequently: 57 1st Qu.: 448.0
## Median :1320 Median :36.00 Travel_Rarely :211 Median : 775.0
## Mean :1320 Mean :37.86 Mean : 784.8
## 3rd Qu.:1395 3rd Qu.:44.00 3rd Qu.:1117.0
## Max. :1470 Max. :60.00 Max. :1490.0
##
## Department DistanceFromHome Education
## Human Resources : 11 Min. : 1.00 Min. :1.000
## Research & Development:209 1st Qu.: 2.00 1st Qu.:2.000
## Sales : 80 Median : 7.00 Median :3.000
## Mean : 9.26 Mean :2.973
## 3rd Qu.:14.00 3rd Qu.:4.000
## Max. :29.00 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Human Resources : 7 Min. :1 Min. : 2.0 Min. :1.000
## Life Sciences :130 1st Qu.:1 1st Qu.: 508.8 1st Qu.:2.000
## Marketing : 27 Median :1 Median : 994.5 Median :3.000
## Medical : 94 Mean :1 Mean :1020.9 Mean :2.733
## Other : 12 3rd Qu.:1 3rd Qu.:1542.5 3rd Qu.:4.000
## Technical Degree: 30 Max. :1 Max. :2065.0 Max. :4.000
##
## Gender HourlyRate JobInvolvement JobLevel
## Female:105 Min. : 30.00 Min. :1.000 Min. :1.0
## Male :195 1st Qu.: 50.00 1st Qu.:2.000 1st Qu.:1.0
## Median : 66.00 Median :3.000 Median :2.0
## Mean : 66.07 Mean :2.743 Mean :2.2
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.0
## Max. :100.00 Max. :4.000 Max. :5.0
##
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Research Scientist :61 Min. :1.000 Divorced: 65 Min. : 1232
## Sales Executive :57 1st Qu.:2.000 Married :128 1st Qu.: 3034
## Laboratory Technician :55 Median :3.000 Single :107 Median : 5208
## Manufacturing Director :31 Mean :2.767 Mean : 7103
## Manager :30 3rd Qu.:4.000 3rd Qu.: 9750
## Healthcare Representative:29 Max. :4.000 Max. :19973
## (Other) :37
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. : 2097 Min. :0.000 Y:300 No :212 Min. :11.00
## 1st Qu.: 8420 1st Qu.:1.000 Yes: 88 1st Qu.:12.00
## Median :15091 Median :2.000 Median :14.00
## Mean :14499 Mean :2.547 Mean :15.17
## 3rd Qu.:20330 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :26914 Max. :9.000 Max. :25.00
##
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.153 Mean :2.803 Mean :80 Mean :0.7833
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
##
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :2.000 Median :3.000 Median : 5.000
## Mean :12.44 Mean :2.683 Mean :2.747 Mean : 7.527
## 3rd Qu.:18.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :38.00 Max. :6.000 Max. :4.000 Max. :37.000
##
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 2.00 1st Qu.: 0.00 1st Qu.: 2.00
## Median : 3.00 Median : 1.00 Median : 3.00
## Mean : 4.33 Mean : 2.29 Mean : 4.38
## 3rd Qu.: 7.00 3rd Qu.: 3.00 3rd Qu.: 7.00
## Max. :18.00 Max. :15.00 Max. :17.00
##
summary(no_salary_data)
## ID Age Attrition BusinessTravel
## Min. : 871.0 Min. :18.00 No :249 Non-Travel : 24
## 1st Qu.: 945.8 1st Qu.:29.00 Yes: 51 Travel_Frequently: 62
## Median :1020.5 Median :36.00 Travel_Rarely :214
## Mean :1020.5 Mean :36.27
## 3rd Qu.:1095.2 3rd Qu.:42.00
## Max. :1170.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 105.0 Human Resources : 17 Min. : 1.00 Min. :1.000
## 1st Qu.: 429.2 Research & Development:190 1st Qu.: 2.00 1st Qu.:2.000
## Median : 693.0 Sales : 93 Median : 7.00 Median :3.000
## Mean : 783.2 Mean : 8.70 Mean :2.887
## 3rd Qu.:1171.2 3rd Qu.:11.25 3rd Qu.:4.000
## Max. :1492.0 Max. :29.00 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Human Resources : 5 Min. :1 Min. : 7 Min. :1.00
## Life Sciences :118 1st Qu.:1 1st Qu.: 477 1st Qu.:2.00
## Marketing : 32 Median :1 Median :1008 Median :3.00
## Medical :100 Mean :1 Mean :1014 Mean :2.77
## Other : 18 3rd Qu.:1 3rd Qu.:1569 3rd Qu.:4.00
## Technical Degree: 27 Max. :1 Max. :2068 Max. :4.00
##
## Gender HourlyRate JobInvolvement JobLevel
## Female:129 Min. : 30.00 Min. :1.000 Min. :1
## Male :171 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1
## Median : 66.00 Median :3.000 Median :2
## Mean : 66.52 Mean :2.737 Mean :2
## 3rd Qu.: 85.25 3rd Qu.:3.000 3rd Qu.:2
## Max. :100.00 Max. :4.000 Max. :5
##
## JobRole JobSatisfaction MaritalStatus MonthlyRate
## Sales Executive :69 Min. :1.000 Divorced: 71 Min. : 2122
## Research Scientist :59 1st Qu.:2.000 Married :135 1st Qu.: 7778
## Laboratory Technician :51 Median :3.000 Single : 94 Median :13508
## Manufacturing Director :27 Mean :2.747 Mean :14091
## Healthcare Representative:26 3rd Qu.:4.000 3rd Qu.:20464
## Manager :21 Max. :4.000 Max. :26999
## (Other) :47
## NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating
## Min. :0.00 Y:300 No :224 Min. :11.00 Min. :3.00
## 1st Qu.:1.00 Yes: 76 1st Qu.:12.75 1st Qu.:3.00
## Median :2.00 Median :14.00 Median :3.00
## Mean :2.74 Mean :15.28 Mean :3.16
## 3rd Qu.:4.00 3rd Qu.:18.00 3rd Qu.:3.00
## Max. :9.00 Max. :25.00 Max. :4.00
##
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## Min. :1.000 Min. :80 Min. :0.0000 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :80 Median :1.0000 Median : 9.00
## Mean :2.637 Mean :80 Mean :0.8333 Mean :10.78
## 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:14.00
## Max. :4.000 Max. :80 Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.00 Min. :1.000 Min. : 0.000 Min. : 0.0
## 1st Qu.:2.00 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.0
## Median :3.00 Median :3.000 Median : 5.000 Median : 3.0
## Mean :2.82 Mean :2.717 Mean : 6.623 Mean : 4.2
## 3rd Qu.:3.00 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.0
## Max. :6.00 Max. :4.000 Max. :33.000 Max. :16.0
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 2.000
## Median : 1.00 Median : 3.000
## Mean : 2.14 Mean : 3.817
## 3rd Qu.: 3.00 3rd Qu.: 7.000
## Max. :15.00 Max. :15.000
##
str(no_salary_data)
## 'data.frame': 300 obs. of 35 variables:
## $ ID : int 871 872 873 874 875 876 877 878 879 880 ...
## $ Age : int 43 33 55 36 27 39 33 21 30 51 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 2 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 2 3 3 1 3 3 1 2 2 3 ...
## $ DailyRate : int 1422 461 267 1351 1302 895 750 251 1312 1405 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 3 2 2 3 3 2 2 2 ...
## $ DistanceFromHome : int 2 13 13 9 19 5 22 10 23 11 ...
## $ Education : int 4 1 4 4 3 3 2 2 3 2 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 3 2 5 6 3 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1849 995 1372 1949 1619 42 160 1279 159 1367 ...
## $ EnvironmentSatisfaction : int 1 2 1 1 4 4 3 1 1 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 2 2 1 2 1 ...
## $ HourlyRate : int 92 53 85 66 67 56 95 45 96 82 ...
## $ JobInvolvement : int 3 3 4 4 2 3 3 2 1 2 ...
## $ JobLevel : int 2 1 4 1 1 2 2 1 1 4 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 8 3 3 9 8 3 7 5 ...
## $ JobSatisfaction : int 4 4 3 2 1 4 2 3 3 2 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 3 3 2 1 2 2 3 1 3 ...
## $ MonthlyRate : int 19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
## $ NumCompaniesWorked : int 1 3 6 1 1 3 0 1 1 3 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
## $ PercentSalaryHike : int 20 18 17 22 11 14 13 20 25 16 ...
## $ PerformanceRating : int 4 3 3 4 3 3 3 4 4 3 ...
## $ RelationshipSatisfaction: int 3 1 3 2 1 3 1 3 3 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 0 2 1 1 0 3 0 ...
## $ TotalWorkingYears : int 7 5 24 5 7 19 8 2 10 29 ...
## $ TrainingTimesLastYear : int 5 4 2 3 3 6 2 2 2 1 ...
## $ WorkLifeBalance : int 3 3 2 3 3 4 4 1 2 2 ...
## $ YearsAtCompany : int 7 3 19 5 7 1 7 2 10 5 ...
## $ YearsInCurrentRole : int 7 2 7 4 7 0 7 2 7 2 ...
## $ YearsSinceLastPromotion : int 7 0 3 0 0 0 0 2 0 0 ...
## $ YearsWithCurrManager : int 7 2 8 2 7 0 7 2 9 3 ...
# Drop columns that doesnt make sense to the analysis.
# Employee count (always 1), Standard hours (always 80) and over 18 (always 18) variables are constant.
drops <- c("EmployeeCount", "EmployeeNumber", "Over18", "StandardHours")
raw_data <- raw_data[ , !(names(raw_data) %in% drops)]
# From the documentation we have reference values for a columns JobInvolvement, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance.
# Lets replace those values with actual values.
# We will also be removing a few columns from our raw data which we wont be using in analysis further.
imp_data <- raw_data %>%
mutate(JobInvolvement = as.factor(if_else(JobInvolvement == 1,"Low",if_else(JobInvolvement == 2, "Medium",if_else(JobInvolvement == 3, "High", "Very High")))), JobSatisfaction = as.factor(if_else(JobSatisfaction == 1, "Low",if_else(JobSatisfaction == 2, "Medium",if_else(JobSatisfaction == 3, "High","Very High")))), PerformanceRating = as.factor(if_else(PerformanceRating == 1, "Low",if_else(PerformanceRating == 2, "Good", if_else(PerformanceRating == 3, "Excellent", "Outstanding")))),RelationshipSatisfaction = as.factor(if_else(RelationshipSatisfaction == 1, "Low",if_else(RelationshipSatisfaction == 2, "Medium", if_else(RelationshipSatisfaction == 3, "High", "Very High")))),WorkLifeBalance = as.factor(if_else(WorkLifeBalance == 1, "Bad",if_else(WorkLifeBalance == 2, "Good", if_else(WorkLifeBalance == 3, "Better", "Best")))))
imp_no_attrition <- no_attrition_data %>%
mutate(JobInvolvement = as.factor(if_else(JobInvolvement == 1,"Low",if_else(JobInvolvement == 2, "Medium",if_else(JobInvolvement == 3, "High", "Very High")))), JobSatisfaction = as.factor(if_else(JobSatisfaction == 1, "Low",if_else(JobSatisfaction == 2, "Medium",if_else(JobSatisfaction == 3, "High","Very High")))), PerformanceRating = as.factor(if_else(PerformanceRating == 1, "Low",if_else(PerformanceRating == 2, "Good", if_else(PerformanceRating == 3, "Excellent", "Outstanding")))),RelationshipSatisfaction = as.factor(if_else(RelationshipSatisfaction == 1, "Low",if_else(RelationshipSatisfaction == 2, "Medium", if_else(RelationshipSatisfaction == 3, "High", "Very High")))),WorkLifeBalance = as.factor(if_else(WorkLifeBalance == 1, "Bad",if_else(WorkLifeBalance == 2, "Good", if_else(WorkLifeBalance == 3, "Better", "Best")))))
imp_no_salary <- no_salary_data %>%
mutate(JobInvolvement = as.factor(if_else(JobInvolvement == 1,"Low",if_else(JobInvolvement == 2, "Medium",if_else(JobInvolvement == 3, "High", "Very High")))), JobSatisfaction = as.factor(if_else(JobSatisfaction == 1, "Low",if_else(JobSatisfaction == 2, "Medium",if_else(JobSatisfaction == 3, "High","Very High")))), PerformanceRating = as.factor(if_else(PerformanceRating == 1, "Low",if_else(PerformanceRating == 2, "Good", if_else(PerformanceRating == 3, "Excellent", "Outstanding")))),RelationshipSatisfaction = as.factor(if_else(RelationshipSatisfaction == 1, "Low",if_else(RelationshipSatisfaction == 2, "Medium", if_else(RelationshipSatisfaction == 3, "High", "Very High")))),WorkLifeBalance = as.factor(if_else(WorkLifeBalance == 1, "Bad",if_else(WorkLifeBalance == 2, "Good", if_else(WorkLifeBalance == 3, "Better", "Best")))))
classification_data <- imp_data
summary(imp_data)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 No :730 Non-Travel : 94
## 1st Qu.:218.2 1st Qu.:30.00 Yes:140 Travel_Frequently:158
## Median :435.5 Median :35.00 Travel_Rarely :618
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Human Resources : 35 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Research & Development:562 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Sales :273 Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
##
## EducationField EnvironmentSatisfaction Gender HourlyRate
## Human Resources : 15 Min. :1.000 Female:354 Min. : 30.00
## Life Sciences :358 1st Qu.:2.000 Male :516 1st Qu.: 48.00
## Marketing :100 Median :3.000 Median : 66.00
## Medical :270 Mean :2.701 Mean : 65.61
## Other : 52 3rd Qu.:4.000 3rd Qu.: 83.00
## Technical Degree: 75 Max. :4.000 Max. :100.00
##
## JobInvolvement JobLevel JobRole
## High :514 Min. :1.000 Sales Executive :200
## Low : 47 1st Qu.:1.000 Research Scientist :172
## Medium :228 Median :2.000 Laboratory Technician :153
## Very High: 81 Mean :2.039 Manufacturing Director : 87
## 3rd Qu.:3.000 Healthcare Representative: 76
## Max. :5.000 Sales Representative : 53
## (Other) :129
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## High :254 Divorced:191 Min. : 1081 Min. : 2094
## Low :179 Married :410 1st Qu.: 2840 1st Qu.: 8092
## Medium :166 Single :269 Median : 4946 Median :14074
## Very High:271 Mean : 6390 Mean :14326
## 3rd Qu.: 8182 3rd Qu.:20456
## Max. :19999 Max. :26997
##
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## Min. :0.000 No :618 Min. :11.0 Excellent :738
## 1st Qu.:1.000 Yes:252 1st Qu.:12.0 Outstanding:132
## Median :2.000 Median :14.0
## Mean :2.728 Mean :15.2
## 3rd Qu.:4.000 3rd Qu.:18.0
## Max. :9.000 Max. :25.0
##
## RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## High :261 Min. :0.0000 Min. : 0.00
## Low :174 1st Qu.:0.0000 1st Qu.: 6.00
## Medium :171 Median :1.0000 Median :10.00
## Very High:264 Mean :0.7839 Mean :11.05
## 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Bad : 48 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 Best : 98 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Better:532 Median : 5.000 Median : 3.000
## Mean :2.832 Good :192 Mean : 6.962 Mean : 4.205
## 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 7.000
## Max. :6.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 1.000 Median : 3.00
## Mean : 2.169 Mean : 4.14
## 3rd Qu.: 3.000 3rd Qu.: 7.00
## Max. :15.000 Max. :17.00
##
# Separate continuous and categorical variables for analysis
imp_data_conti <- imp_data[, !sapply(imp_data, is.factor)]
imp_data_categ <- imp_data[, sapply(imp_data, is.factor)]
# Computing the p value of correlations
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
p.mat <- cor.mtest(imp_data_conti)
correlation <- cor(imp_data_conti)
# Correlation plot with significance level of 0.05
corrplot::corrplot(correlation, type="upper", order="hclust", title ="correlation", p.mat = p.mat, sig.level = 0.05)
cat_cor<- GKtauDataframe(imp_data_categ)
plot(cat_cor, corrColors = "blue")
EDA - Correlation Analysis. Below variables are correlated to each other 1) YearsSinceLastPromotion -> YearsWithCurrManager, YearsInCurrentRole, YearsAtCompany, TotalWorkingYears 2) YearsAtCompany -> JobLevel, MonthlyIncome 3) Age -> TotalWorkingYears, JobLevel, MonthlyIncome 4) TotalWorkingYears -> JobLevel, MonthlyIncome 5) Department -> JobRole
# Histogram of each variable
# Scatter Plot
pairs(imp_data_conti, pch=19)
# Box Plot
boxplot(imp_data_conti)
boxplot(imp_data_conti$MonthlyIncome, main="Boxplot Monthly Income")
# Employee Personal Demographics - Numerical Variables
p1 <- ggplot(imp_data) + geom_histogram(aes(Age), binwidth = 5, fill = "light green",col = "black")
p2 <- ggplot(imp_data) + geom_histogram(aes(DistanceFromHome), binwidth = 5, fill = "light green",col = "black")
p3 <- ggplot(imp_data) + geom_histogram(aes(NumCompaniesWorked), binwidth = 2, fill = "light green",col = "black")
p4 <- ggplot(imp_data) + geom_histogram(aes(TotalWorkingYears), binwidth = 4, fill = "light green",col = "black")
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
# Employee Billing Rate Demographics - Numerical Variables
p1 <- ggplot(imp_data) + geom_histogram(aes(HourlyRate), binwidth = 5, fill = "light green",col = "black")
p2 <- ggplot(imp_data) + geom_histogram(aes(DailyRate), binwidth = 100, fill = "light green",col = "black")
p3 <- ggplot(imp_data) + geom_histogram(aes(MonthlyRate), binwidth = 1000, fill = "light green",col = "black")
grid.arrange(p1, p2, p3, nrow = 3)
#Observations :
#There seems to be no clear cut pattern observed in these rates.
#Employee Work Demographics - Numerical Variables
p1 <- ggplot(imp_data) + geom_histogram(aes(Age), binwidth = 5, fill = "light green",col = "black")
p2 <- ggplot(imp_data) + geom_histogram(aes(PercentSalaryHike), binwidth = 1, fill = "light green",col = "black")
p3 <- ggplot(imp_data) + geom_histogram(aes(YearsAtCompany), binwidth = 2, fill = "light green",col = "black")
p4 <- ggplot(imp_data) + geom_histogram(aes(YearsInCurrentRole), binwidth = 2, fill = "light green",col = "black")
p5 <- ggplot(imp_data) + geom_histogram(aes(YearsSinceLastPromotion), binwidth = 2, fill = "light green",col = "black")
p6 <- ggplot(imp_data) + geom_histogram(aes(YearsWithCurrManager), binwidth = 2, fill = "light green",col = "black")
p7 <- ggplot(imp_data) + geom_histogram(aes(MonthlyIncome), binwidth = 1000, fill = "light green",col = "black")
p8 <- ggplot(imp_data) + geom_histogram(aes(DistanceFromHome), binwidth = 5, fill = "light green",col = "black")
p9 <- ggplot(imp_data) + geom_histogram(aes(NumCompaniesWorked), binwidth = 2, fill = "light green",col = "black")
grid.arrange(p1, p2, p3, p4, p5, p6,p7,p8,p9 ,nrow = 3, ncol = 3)
#Observations:
#All 6 variables are right skewed
#Employee Personal Demographics - Categorical Variables
p1<- imp_data %>%
dplyr::group_by(Gender) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(Gender), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Gender") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 900))
p2<- imp_data %>%
dplyr::group_by(Education) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(Education), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Education") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 650))
p3 <- imp_data %>%
dplyr::group_by(EducationField) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(EducationField), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Education Field") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 650))
p4 <- imp_data %>%
dplyr::group_by(MaritalStatus) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(MaritalStatus), y = counts)) + geom_bar(stat = 'identity', fill = "light green")+ ggtitle("Marital Status") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 750))
p5 <- imp_data %>%
dplyr::group_by(RelationshipSatisfaction) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(RelationshipSatisfaction), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Relationship Satisfaction") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())+ scale_y_continuous(limits = c(0, 500))
p6 <- imp_data %>%
dplyr::group_by(WorkLifeBalance) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(WorkLifeBalance), y = counts)) + geom_bar(stat = 'identity', fill = "light green")+ ggtitle("Work Life Balance") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 950))
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3)
#Employee Work Demographics - Categorical Variables
p1 <- imp_data %>%
dplyr::group_by(BusinessTravel) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(BusinessTravel), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Business Travel") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank())+ scale_y_continuous(limits = c(0, 1100))
p2 <- imp_data %>%
dplyr::group_by(EnvironmentSatisfaction) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(EnvironmentSatisfaction), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Environment Satisfaction") + geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 500))
p3 <- imp_data %>%
dplyr::group_by(JobInvolvement) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(JobInvolvement), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Involvement") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 900))
p4 <- imp_data %>%
dplyr::group_by(JobSatisfaction) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(JobSatisfaction), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Satisfaction") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 500))
p5 <- imp_data %>%
dplyr::group_by(OverTime) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(OverTime), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Over Time") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 1100))
p6 <- imp_data %>%
dplyr::group_by(PerformanceRating) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(PerformanceRating), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Performance Rating") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 1300))
grid.arrange(p1,p2,p3,p4,p5,p6,nrow = 2)
p1 <- imp_data %>%
dplyr::group_by(Department) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(Department), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Department") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size = 7, angle = 45, hjust = 1),axis.title.x=element_blank())
p2 <- imp_data %>%
dplyr::group_by(JobRole) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(JobRole), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Role") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
grid.arrange(p1,p2 ,ncol = 2)
p1 <- imp_data %>%
dplyr::group_by(WorkLifeBalance) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(WorkLifeBalance), y = counts)) + geom_bar(stat = 'identity', fill = "light green")+ ggtitle("Work Life Balance") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25) + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))
p2 <- imp_data %>%
dplyr::group_by(BusinessTravel) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(BusinessTravel), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Business Travel") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank())+ scale_y_continuous(limits = c(0, 800))
p3 <- imp_data %>%
dplyr::group_by(JobInvolvement) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(JobInvolvement), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Job Involvement") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =10,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))
p4 <- imp_data %>%
dplyr::group_by(OverTime) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(OverTime), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Over Time") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))
p5 <- imp_data %>%
dplyr::group_by(PerformanceRating) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(PerformanceRating), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Performance Rating") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))
p6 <- imp_data %>%
dplyr::group_by(Department) %>%
dplyr::summarise(counts = n()) %>%
ggplot(aes(x = as.factor(Department), y = counts)) + geom_bar(stat = 'identity', fill = "light green") + ggtitle("Department") +geom_text(aes(label=counts), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ theme(plot.title = element_text(size =10),axis.text.x = element_text(size = 7, angle = 45, hjust = 1),axis.title.x=element_blank()) + scale_y_continuous(limits = c(0, 800))
grid.arrange(p1,p2,p3,p4,p5,p6,nrow = 2)
EDA - Observations - Univariate Analysis 1) MonthlyIncome has outliers. 2) Right Skewed data - JobLevel, MonthlyIncome, PercentageSalaryHike, TotalWorkingYears, YearsAtCompany, YearsSinceLastPromotion 3) More than 50% employees are male. 4) Almost 70% of the employees are from Life Science and Medical Background. 5) 47% of the employees are married where as 22% are divorced. 6) More than 60% of the employees feel they have a better work life balance. 7) More than 70% of the employees Travel rarely for work. 8) More than 70% of the people seem to be working over time
#After looking at every feature individually, let's now do some bivariate/multivariate analysis. Here we'll explore the independent variables with respect to the target variable. The objective is to discover hidden relationships between the independent variables and the target variable.
#Employee Personal Demographics - Numerical Variables
p1 <- imp_data %>%
ggplot(aes(x = Age, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Age") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p2 <- imp_data %>%
ggplot(aes(x = DistanceFromHome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Distance From Home") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p3 <- imp_data %>%
ggplot(aes(x = NumCompaniesWorked, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Number of Companies") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p4 <- imp_data %>%
ggplot(aes(x = TotalWorkingYears, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Total Working Years") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
#Employee Billing Rate Demographics - Numerical Variables
p1 <- imp_data %>%
ggplot(aes(x = HourlyRate, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Hourly Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p2 <- imp_data %>%
ggplot(aes(x = DailyRate, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Daily Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p3 <- imp_data %>%
ggplot(aes(x = MonthlyRate, fill = Attrition)) + geom_density(alpha = 0.5)+ ggtitle("Monthly Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
grid.arrange(p1, p2, p3)
# Employee Work Demographics - Numerical Variables
p1 <- imp_data %>%
ggplot(aes(x = MonthlyIncome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Monthly Income") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p2 <- imp_data %>%
ggplot(aes(x = PercentSalaryHike, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Percentage Salary Hike") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p3 <- imp_data %>%
ggplot(aes(x = YearsAtCompany, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years At Company") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p4 <- imp_data %>%
ggplot(aes(x = YearsInCurrentRole, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years in Current Role") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p5 <- imp_data %>%
ggplot(aes(x = YearsSinceLastPromotion, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years Since Last Promotion") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p6 <- imp_data %>%
ggplot(aes(x = YearsWithCurrManager, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Years With Current Manager") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
grid.arrange(p1, p2, p3, p4, p5, p6 , nrow = 3, ncol = 2)
p1 <- imp_data %>%
ggplot(aes(x = Age, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Age") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p2 <- imp_data %>%
ggplot(aes(x = DistanceFromHome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Distance From Home") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p3 <- imp_data %>%
ggplot(aes(x = TotalWorkingYears, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Total Working Years") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
p4 <- imp_data %>%
ggplot(aes(x = MonthlyIncome, fill = Attrition)) + geom_density(alpha = 0.5) + ggtitle("Monthly Income") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank())
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
#Employee Personal Demographics - Categorical Variables
p1 <- imp_data %>%
dplyr::group_by(Gender) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = Gender, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Gender") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 20))
p2 <- imp_data %>%
dplyr::group_by(Education) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = Education, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Education") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 20))
p3 <- imp_data %>%
dplyr::group_by(EducationField) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = EducationField, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Education Field") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))
p4 <- imp_data %>%
dplyr::group_by(MaritalStatus) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = MaritalStatus, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Marital Status") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 40))
p5 <- imp_data %>%
dplyr::group_by(RelationshipSatisfaction) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = as.factor(RelationshipSatisfaction), y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Relationship Satisfaction") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 40))
p6 <- imp_data %>%
dplyr::group_by(WorkLifeBalance) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = as.factor(WorkLifeBalance), y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Work Life Balance") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) + geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.15)+ scale_y_continuous(limits = c(0, 40))
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3)
#Employee Work Demographics - Categorical Variables
p1 <- imp_data %>%
dplyr::group_by(BusinessTravel) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = BusinessTravel, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Business Travel") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))
p2 <- imp_data %>%
dplyr::group_by(EnvironmentSatisfaction) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = EnvironmentSatisfaction, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Environment Satisfaction") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))
p3 <- imp_data %>%
dplyr::group_by(JobInvolvement) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = JobInvolvement, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Job Involvement") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 40))
p4 <- imp_data %>%
dplyr::group_by(JobSatisfaction) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = JobSatisfaction, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Job Satisfaction") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 30))
p5 <- imp_data %>%
dplyr::group_by(OverTime) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = OverTime, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Over Time") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 35))
p6 <- imp_data %>%
dplyr::group_by(PerformanceRating) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = as.factor(PerformanceRating), y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Performance Rating") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)+ scale_y_continuous(limits = c(0, 20))
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3)
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
p1 <- imp_data %>%
dplyr::group_by(Department) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = Department, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "light green") + ggtitle("Attrition Rate - Department") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)
p2 <- imp_data %>%
dplyr::group_by(JobRole) %>%
dplyr::summarise(attrition_rate = round((sum(if_else(Attrition == "Yes",1,0))/n()*100),2)) %>%
ggplot(aes(x = JobRole, y = attrition_rate))+ geom_bar(stat = 'identity',fill = "Orange") + ggtitle("Job Role - Attrition Rate") + theme(plot.title = element_text(size =10),axis.text.x = element_text(size =7,angle = 45, hjust = 1),axis.title.x=element_blank()) +geom_text(aes(label=attrition_rate), size = 2.5, position=position_dodge(width=0.2), vjust=-0.25)
p2
grid.arrange(p1, p2, ncol = 2)
EDA - Observations - Bi/Multivariate Analysis 1) Younger employees within 25-35 years have a higher attrition rate. 2) We see a lower attrition rate when the Distance from home is within 10 units. The attrition rate increase post 10 units. 3) The attrition rate tends to be higher with employees who have worked with 5 to 7 companies. 4) We observe a peak in attrition rate at a monthly income approx. 2500. 5) We also see a peak in attrition rate when the employee is with the company for 0-2 years approx. 6) #Attrition rate is very high among employees from HR, Marketing and Technical backgrounds. 7) As expected, the attrition rate is very high among employees who have a bad work life balance. 8) Attrition rate is higher among people who travel frequently. 9) Its also higher among employees who have a low environment satisfaction, low job involvement and low job satisfaction. 10) The attrition rate is almost 30% among employees who work over time. 11) Sales department have the highest attrition at 21% whereas Sales Representatives have the highest attrition at 45%.
set.seed(1234)
splitPerc = .70
trainIndices = sample(1:dim(classification_data)[1],round(splitPerc * dim(classification_data)[1]))
train = classification_data[trainIndices,]
test = classification_data[-trainIndices,]
# 50-50 SMOTE down sampling
train_down_sampled <- SMOTE(form = Attrition~.,data = train, k = 10, perc.over = 100)
# ROSE Down sampling
train_bal_r <- ROSE(Attrition ~ ., data = train)$data
train_sampled <- train_down_sampled
#Split train and test model
set.seed(1234)
model.naive <- naiveBayes(Attrition ~ ., data=train_sampled[,-c(1)])
attrition.pred.naive <- predict(model.naive, test[,-c(1)], type="class")
confusionMatrix(attrition.pred.naive, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 130 11
## Yes 83 37
##
## Accuracy : 0.6398
## 95% CI : (0.5784, 0.6981)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2411
##
## Mcnemar's Test P-Value : 2.423e-13
##
## Sensitivity : 0.6103
## Specificity : 0.7708
## Pos Pred Value : 0.9220
## Neg Pred Value : 0.3083
## Prevalence : 0.8161
## Detection Rate : 0.4981
## Detection Prevalence : 0.5402
## Balanced Accuracy : 0.6906
##
## 'Positive' Class : No
##
# Prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs = TRUE)
set.seed(1234)
# Train the model
model.naive.t <- train( Attrition ~ ., data=train_sampled[, -c(1)], method="naive_bayes", trControl=control)
attrition.pred.naive.t <- predict(model.naive.t, test[-c(1)])
confusionMatrix(attrition.pred.naive.t, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 124 13
## Yes 89 35
##
## Accuracy : 0.6092
## 95% CI : (0.5471, 0.6688)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.193
##
## Mcnemar's Test P-Value : 1.118e-13
##
## Sensitivity : 0.5822
## Specificity : 0.7292
## Pos Pred Value : 0.9051
## Neg Pred Value : 0.2823
## Prevalence : 0.8161
## Detection Rate : 0.4751
## Detection Prevalence : 0.5249
## Balanced Accuracy : 0.6557
##
## 'Positive' Class : No
##
# Top 10 Glmnet predictor ranking
importance.naive.t <- varImp(model.naive.t, scale=FALSE)
rank.naive.t <- importance.naive.t$importance
write.csv(rank.naive.t, "rank.naive.t.csv")
rank.naive.t <- read.csv("rank.naive.t.csv", header=TRUE)
colnames(rank.naive.t) <- c("Predictors", "Importance")
rank.naive.t <- rank.naive.t[order(rank.naive.t$Importance, decreasing = TRUE),]
ggplot(rank.naive.t[1:20,], aes(x=reorder(Predictors, Importance),y=Importance)) + geom_bar(stat = "identity") + coord_flip() + labs(title="Importance of Predictors", x="Predictors", y="Importance") +theme(axis.text.x=element_text(hjust=0.5, vjust=0.5, size = 12))+theme(axis.text.y=element_text(size = 12))
set.seed(1234)
nb_model1 <- Attrition ~ OverTime + MaritalStatus + StockOptionLevel + +MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + YearsInCurrentRole
model.naive.ct <- naiveBayes(nb_model1, data=train_sampled)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
confusionMatrix(attrition.pred.naive.ct, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 115 11
## Yes 98 37
##
## Accuracy : 0.5824
## 95% CI : (0.52, 0.6429)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1826
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5399
## Specificity : 0.7708
## Pos Pred Value : 0.9127
## Neg Pred Value : 0.2741
## Prevalence : 0.8161
## Detection Rate : 0.4406
## Detection Prevalence : 0.4828
## Balanced Accuracy : 0.6554
##
## 'Positive' Class : No
##
set.seed(1234)
nb_model2 <- Attrition ~ OverTime + MaritalStatus + MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + JobSatisfaction + DistanceFromHome + NumCompaniesWorked + WorkLifeBalance + JobRole + StockOptionLevel
model.naive.ct <- naiveBayes(nb_model2, data=train_sampled)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
confusionMatrix(attrition.pred.naive.ct, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 129 10
## Yes 84 38
##
## Accuracy : 0.6398
## 95% CI : (0.5784, 0.6981)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2488
##
## Mcnemar's Test P-Value : 5.098e-14
##
## Sensitivity : 0.6056
## Specificity : 0.7917
## Pos Pred Value : 0.9281
## Neg Pred Value : 0.3115
## Prevalence : 0.8161
## Detection Rate : 0.4943
## Detection Prevalence : 0.5326
## Balanced Accuracy : 0.6987
##
## 'Positive' Class : No
##
set.seed(1234)
Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel + JobSatisfaction + DistanceFromHome + YearsWithCurrManager
## Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel +
## JobSatisfaction + DistanceFromHome + YearsWithCurrManager
nb_model3 <- Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel + JobSatisfaction + DistanceFromHome + YearsWithCurrManager
model.naive.ct <- naiveBayes(nb_model3, data=train_sampled)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
confusionMatrix(attrition.pred.naive.ct, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 146 14
## Yes 67 34
##
## Accuracy : 0.6897
## 95% CI : (0.6297, 0.7453)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2758
##
## Mcnemar's Test P-Value : 7.569e-09
##
## Sensitivity : 0.6854
## Specificity : 0.7083
## Pos Pred Value : 0.9125
## Neg Pred Value : 0.3366
## Prevalence : 0.8161
## Detection Rate : 0.5594
## Detection Prevalence : 0.6130
## Balanced Accuracy : 0.6969
##
## 'Positive' Class : No
##
From the above Naive Bayes Models, Model 3 with predictors (OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel + JobSatisfaction + DistanceFromHome + YearsWithCurrManager) gave better sensitiviy(69%) and specificity(72%)
#Prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
set.seed(1234)
#Train the model
model.knn <- train( Attrition ~ ., data=train_sampled, method="knn", trControl=control)
attrition.pred.knn <- predict(model.knn, test)
confusionMatrix(attrition.pred.knn, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 124 21
## Yes 89 27
##
## Accuracy : 0.5785
## 95% CI : (0.5161, 0.6392)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0934
##
## Mcnemar's Test P-Value : 1.679e-10
##
## Sensitivity : 0.5822
## Specificity : 0.5625
## Pos Pred Value : 0.8552
## Neg Pred Value : 0.2328
## Prevalence : 0.8161
## Detection Rate : 0.4751
## Detection Prevalence : 0.5556
## Balanced Accuracy : 0.5723
##
## 'Positive' Class : No
##
#Top 10 predictor ranking
importance.knn <- varImp(model.knn, scale=FALSE)
rank.knn <- importance.knn$importance
write.csv(rank.knn, "rank.knn.csv")
rank.knn <- read.csv("rank.knn.csv", header=TRUE)
colnames(rank.knn) <- c("Predictors", "Importance")
rank.knn <- rank.knn[order(rank.knn$Importance, decreasing = TRUE),]
ggplot(rank.knn[1:20,], aes(x=reorder(Predictors, Importance),y=Importance)) + geom_bar(stat = "identity") + coord_flip() + labs(title="Importance of Predictors", x="Predictors", y="Importance") +theme(axis.text.x=element_text(hjust=0.5, vjust=0.5, size = 12))+theme(axis.text.y=element_text(size = 12))
#Create training and test data
set.seed(1234)
knn_model1 <- Attrition ~ OverTime + MaritalStatus + StockOptionLevel + +MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + YearsInCurrentRole
# k-Nearest Neighbours
fit_knn <- train(knn_model1,train_sampled,method = 'knn',trControl = trainControl(method = 'repeatedcv',number = 3))
fit_knn
## k-Nearest Neighbors
##
## 368 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times)
## Summary of sample sizes: 246, 246, 244
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.5572008 0.1144016
## 7 0.5653534 0.1307069
## 9 0.5601534 0.1203067
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
Predictions_knn <- predict(fit_knn,test)
#Set the training control method
trainMethod <- trainControl(method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
fit.knn <- train(knn_model1, data = train_sampled, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
pred.knn <- predict(fit.knn, test)
summary(pred.knn)
## No Yes
## 163 98
confusionMatrix(pred.knn, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 145 18
## Yes 68 30
##
## Accuracy : 0.6705
## 95% CI : (0.6099, 0.7272)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2179
##
## Mcnemar's Test P-Value : 1.265e-07
##
## Sensitivity : 0.6808
## Specificity : 0.6250
## Pos Pred Value : 0.8896
## Neg Pred Value : 0.3061
## Prevalence : 0.8161
## Detection Rate : 0.5556
## Detection Prevalence : 0.6245
## Balanced Accuracy : 0.6529
##
## 'Positive' Class : No
##
#Create training and test data
set.seed(1234)
knn_model2 <- Attrition ~ OverTime + MaritalStatus + MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + JobSatisfaction + DistanceFromHome + NumCompaniesWorked + WorkLifeBalance + JobRole + StockOptionLevel
# k-Nearest Neighbours
fit_knn <- train(knn_model2,train_sampled,method = 'knn',trControl = trainControl(method = 'repeatedcv',number = 3))
fit_knn
## k-Nearest Neighbors
##
## 368 samples
## 12 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times)
## Summary of sample sizes: 246, 246, 244
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.5627093 0.1254186
## 7 0.5653534 0.1307069
## 9 0.5601534 0.1203067
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
Predictions_knn <- predict(fit_knn,test)
#Set the training control method
trainMethod <- trainControl(method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
fit.knn <- train(knn_model2, data = train_sampled, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
pred.knn <- predict(fit.knn, test)
summary(pred.knn)
## No Yes
## 178 83
confusionMatrix(pred.knn, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 160 18
## Yes 53 30
##
## Accuracy : 0.728
## 95% CI : (0.6697, 0.781)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 0.9998
##
## Kappa : 0.2933
##
## Mcnemar's Test P-Value : 5.459e-05
##
## Sensitivity : 0.7512
## Specificity : 0.6250
## Pos Pred Value : 0.8989
## Neg Pred Value : 0.3614
## Prevalence : 0.8161
## Detection Rate : 0.6130
## Detection Prevalence : 0.6820
## Balanced Accuracy : 0.6881
##
## 'Positive' Class : No
##
#Create training and test data
set.seed(1234)
knn_model3 <- Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel + JobSatisfaction + DistanceFromHome + YearsWithCurrManager
# k-Nearest Neighbours
fit_knn <- train(knn_model3,train_sampled,method = 'knn',trControl = trainControl(method = 'repeatedcv',number = 3))
fit_knn
## k-Nearest Neighbors
##
## 368 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times)
## Summary of sample sizes: 246, 246, 244
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.5653975 0.1307950
## 7 0.5653534 0.1307069
## 9 0.5601534 0.1203067
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
Predictions_knn <- predict(fit_knn,test)
#Set the training control method
trainMethod <- trainControl(method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
fit.knn <- train(knn_model3, data = train_sampled, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)
pred.knn <- predict(fit.knn, test)
summary(pred.knn)
## No Yes
## 180 81
confusionMatrix(pred.knn, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 161 19
## Yes 52 29
##
## Accuracy : 0.728
## 95% CI : (0.6697, 0.781)
## No Information Rate : 0.8161
## P-Value [Acc > NIR] : 0.999825
##
## Kappa : 0.2843
##
## Mcnemar's Test P-Value : 0.000146
##
## Sensitivity : 0.7559
## Specificity : 0.6042
## Pos Pred Value : 0.8944
## Neg Pred Value : 0.3580
## Prevalence : 0.8161
## Detection Rate : 0.6169
## Detection Prevalence : 0.6897
## Balanced Accuracy : 0.6800
##
## 'Positive' Class : No
##
From the above KNN, Model 1 with predictors (OverTime + MaritalStatus + StockOptionLevel + +MonthlyIncome + TotalWorkingYears + JobLevel + YearsAtCompany + YearsInCurrentRole) gave better sensitivity(67%) and specificity(64%).
But the Naive Bayes Model3 has better sensitivity(69%) and specificity(72%). So for further predictions we will use Naive Bayes Model 3.
set.seed(1234)
nb_model3 <- Attrition ~ OverTime + MonthlyIncome + MaritalStatus + StockOptionLevel + JobSatisfaction + DistanceFromHome + YearsWithCurrManager
test <- no_attrition_data
model.naive.ct <- naiveBayes(nb_model3, data=raw_data)
attrition.pred.naive.ct <- predict(model.naive.ct, test, type="class")
attrition_prediction <- data.frame(ID=test$ID,Attrition=attrition.pred.naive.ct)
path_out = '/Users/lijjumathew/Code/Github/MSDS-Doing-Datascience-Project2/Results/'
fileName = paste(path_out, 'Case2PredictionsMathewAttrition..csv',sep = '')
#write.csv(attrition_prediction, fileName, row.names = F)
#Create training and test data
set.seed(1234)
# Converting Character Variable into Numeric Variables
conv.attr.reg <- function(data, ...) {
data$AttritionNum[data$Attrition== "Yes"] <-1
data$AttritionNum[data$Attrition== "No"] <-0
data$BusinessTravelNum[data$BusinessTravel== "Non-Travel"] <-1
data$BusinessTravelNum[data$BusinessTravel== "Travel_Rarely"] <-2
data$BusinessTravelNum[data$BusinessTravel== "Travel_Frequently"] <-3
data$DepartmentNum[data$Department== "Human Resources"] <-1
data$DepartmentNum[data$Department== "Research & Development"] <-2
data$DepartmentNum[data$Department== "Sales"] <-3
data$EducationFieldNum[data$EducationField== "Human Resources"] <-1
data$EducationFieldNum[data$EducationField== "Life Sciences"] <-2
data$EducationFieldNum[data$EducationField== "Marketing"] <-3
data$EducationFieldNum[data$EducationField== "Medical"] <-4
data$EducationFieldNum[data$EducationField== "Other"] <-5
data$EducationFieldNum[data$EducationField== "Technical Degree"] <-6
data$GenderNum[data$Gender== "Female"] <-1
data$GenderNum[data$Gender== "Male"] <-2
data$JobRoleNum[data$JobRole== "Human Resources"] <-1
data$JobRoleNum[data$JobRole== "Healthcare Representative"] <-2
data$JobRoleNum[data$JobRole== "Laboratory Technician"] <-3
data$JobRoleNum[data$JobRole== "Manager"] <-4
data$JobRoleNum[data$JobRole== "Manufacturing Director"] <-5
data$JobRoleNum[data$JobRole== "Research Director"] <-6
data$JobRoleNum[data$JobRole== "Research Scientist"] <-7
data$JobRoleNum[data$JobRole== "Sales Executive"] <-8
data$JobRoleNum[data$JobRole== "Sales Representative"] <-9
data$MaritalStatusNum[data$MaritalStatus== "Divorced"] <-1
data$MaritalStatusNum[data$MaritalStatus== "Married"] <-2
data$MaritalStatusNum[data$MaritalStatus== "Single"] <-3
data$OverTimeNum[data$OverTime== "Yes"] <-1
data$OverTimeNum[data$OverTime== "No"] <-0
data$JobInvolvementNum[data$JobInvolvement== "Low"] <-1
data$JobInvolvementNum[data$JobInvolvement== "Medium"] <-2
data$JobInvolvementNum[data$JobInvolvement== "High"] <-3
data$JobInvolvementNum[data$JobInvolvement== "Very High"] <-4
data$JobSatisfactionNum[data$JobSatisfaction== "Low"] <-1
data$JobSatisfactionNum[data$JobSatisfaction== "Medium"] <-2
data$JobSatisfactionNum[data$JobSatisfaction== "High"] <-3
data$JobSatisfactionNum[data$JobSatisfaction== "Very High"] <-4
data$PerformanceRatingNum[data$PerformanceRating== "Low"] <-1
data$PerformanceRatingNum[data$PerformanceRating== "Good"] <-2
data$PerformanceRatingNum[data$PerformanceRating== "Excellent"] <-3
data$PerformanceRatingNum[data$PerformanceRating== "Outstanding"] <-4
data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "Low"] <-1
data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "Medium"] <-2
data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "High"] <-3
data$RelationshipSatisfactionNum[data$RelationshipSatisfaction== "Very High"] <-4
data$WorkLifeBalanceNum[data$WorkLifeBalance== "Bad"] <-1
data$WorkLifeBalanceNum[data$WorkLifeBalance== "Good"] <-2
data$WorkLifeBalanceNum[data$WorkLifeBalance== "Better"] <-3
data$WorkLifeBalanceNum[data$WorkLifeBalance== "Best"] <-4
drops <- c("Attrition", "BusinessTravel", "Department","EducationField","Gender","JobRole","MaritalStatus","OverTime","JobInvolvement", "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction","WorkLifeBalance")
data <- data[ , !(names(data) %in% drops)]
data
}
corr_data <- conv.attr.reg(classification_data)
cor.test(corr_data[,"AttritionNum"], corr_data[,"BusinessTravelNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "BusinessTravelNum"]
## t = 2.376, df = 868, p-value = 0.01772
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01399612 0.14607234
## sample estimates:
## cor
## 0.08038707
cor.test(corr_data[,"AttritionNum"], corr_data[,"DailyRate"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "DailyRate"]
## t = -0.99618, df = 868, p-value = 0.3194
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10003417 0.03274616
## sample estimates:
## cor
## -0.03379312
cor.test(corr_data[,"AttritionNum"], corr_data[,"DistanceFromHome"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "DistanceFromHome"]
## t = 2.577, df = 868, p-value = 0.01013
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02079097 0.15271755
## sample estimates:
## cor
## 0.08713629
cor.test(corr_data[,"AttritionNum"], corr_data[,"EducationFieldNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "EducationFieldNum"]
## t = 0.77046, df = 868, p-value = 0.4412
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04039377 0.09244725
## sample estimates:
## cor
## 0.02614215
cor.test(corr_data[,"AttritionNum"], corr_data[,"GenderNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "GenderNum"]
## t = 0.74416, df = 868, p-value = 0.457
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04128468 0.09156240
## sample estimates:
## cor
## 0.02525034
cor.test(corr_data[,"AttritionNum"], corr_data[,"HourlyRate"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "HourlyRate"]
## t = 1.0777, df = 868, p-value = 0.2815
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0299844 0.1027702
## sample estimates:
## cor
## 0.03655418
cor.test(corr_data[,"AttritionNum"], corr_data[,"MaritalStatusNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "MaritalStatusNum"]
## t = 5.9205, df = 868, p-value = 4.621e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1322814 0.2600751
## sample estimates:
## cor
## 0.197015
cor.test(corr_data[,"AttritionNum"], corr_data[,"MonthlyIncome"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "MonthlyIncome"]
## t = -4.6199, df = 868, p-value = 4.422e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.21912446 -0.08936942
## sample estimates:
## cor
## -0.154915
cor.test(corr_data[,"AttritionNum"], corr_data[,"MonthlyRate"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "MonthlyRate"]
## t = -1.2749, df = 868, p-value = 0.2027
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10938360 0.02330051
## sample estimates:
## cor
## -0.04323217
cor.test(corr_data[,"AttritionNum"], corr_data[,"NumCompaniesWorked"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "NumCompaniesWorked"]
## t = 1.8011, df = 868, p-value = 0.07204
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.005469025 0.126969672
## sample estimates:
## cor
## 0.06101889
cor.test(corr_data[,"AttritionNum"], corr_data[,"OverTimeNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "OverTimeNum"]
## t = 8.3288, df = 868, p-value = 3.164e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2093563 0.3324905
## sample estimates:
## cor
## 0.2720366
cor.test(corr_data[,"AttritionNum"], corr_data[,"PercentSalaryHike"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "PercentSalaryHike"]
## t = 0.45165, df = 868, p-value = 0.6516
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05118981 0.08171055
## sample estimates:
## cor
## 0.01532807
cor.test(corr_data[,"AttritionNum"], corr_data[,"TotalWorkingYears"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "TotalWorkingYears"]
## t = -4.9965, df = 868, p-value = 7.059e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2311035 -0.1018725
## sample estimates:
## cor
## -0.1672061
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsAtCompany"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "YearsAtCompany"]
## t = -3.8252, df = 868, p-value = 0.00014
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.19356333 -0.06282598
## sample estimates:
## cor
## -0.1287541
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsInCurrentRole"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "YearsInCurrentRole"]
## t = -4.6596, df = 868, p-value = 3.665e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22039310 -0.09069163
## sample estimates:
## cor
## -0.1562157
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsSinceLastPromotion"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "YearsSinceLastPromotion"]
## t = -0.13474, df = 868, p-value = 0.8928
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07101747 0.06191123
## sample estimates:
## cor
## -0.004573321
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsWithCurrManager"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "YearsWithCurrManager"]
## t = -4.3718, df = 868, p-value = 1.381e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2111876 -0.0811078
## sample estimates:
## cor
## -0.1467822
cor.test(corr_data[,"AttritionNum"], corr_data[,"YearsWithCurrManager"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "YearsWithCurrManager"]
## t = -4.3718, df = 868, p-value = 1.381e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2111876 -0.0811078
## sample estimates:
## cor
## -0.1467822
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobRoleNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "JobRoleNum"]
## t = 2.5546, df = 868, p-value = 0.0108
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02003487 0.15197869
## sample estimates:
## cor
## 0.08638557
cor.test(corr_data[,"AttritionNum"], corr_data[,"DepartmentNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "DepartmentNum"]
## t = 2.5739, df = 868, p-value = 0.01022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02068592 0.15261490
## sample estimates:
## cor
## 0.08703199
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobLevel"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "JobLevel"]
## t = -4.8409, df = 868, p-value = 1.53e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22616491 -0.09671294
## sample estimates:
## cor
## -0.1621364
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobRoleNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "JobRoleNum"]
## t = 2.5546, df = 868, p-value = 0.0108
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02003487 0.15197869
## sample estimates:
## cor
## 0.08638557
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobInvolvementNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "JobInvolvementNum"]
## t = -5.633, df = 868, p-value = 2.393e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2511246 -0.1228612
## sample estimates:
## cor
## -0.1877934
cor.test(corr_data[,"AttritionNum"], corr_data[,"JobSatisfactionNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "JobSatisfactionNum"]
## t = -3.1862, df = 868, p-value = 0.001493
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.17275210 -0.04135071
## sample estimates:
## cor
## -0.1075209
cor.test(corr_data[,"AttritionNum"], corr_data[,"PerformanceRatingNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "PerformanceRatingNum"]
## t = 0.45182, df = 868, p-value = 0.6515
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05118406 0.08171629
## sample estimates:
## cor
## 0.01533384
cor.test(corr_data[,"AttritionNum"], corr_data[,"RelationshipSatisfactionNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "RelationshipSatisfactionNum"]
## t = -1.169, df = 868, p-value = 0.2427
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10583346 0.02688998
## sample estimates:
## cor
## -0.03964661
cor.test(corr_data[,"AttritionNum"], corr_data[,"WorkLifeBalanceNum"])
##
## Pearson's product-moment correlation
##
## data: corr_data[, "AttritionNum"] and corr_data[, "WorkLifeBalanceNum"]
## t = -2.6561, df = 868, p-value = 0.00805
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.15532845 -0.02346401
## sample estimates:
## cor
## -0.08978971
train_sampled_reg <- conv.attr.reg(train_sampled)
full_model <-AttritionNum~.
fit1 <- lm(full_model, data = train_sampled_reg)
summary(fit1)
##
## Call:
## lm(formula = full_model, data = train_sampled_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85806 -0.29074 0.01326 0.26600 0.99204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.024e+00 3.452e-01 2.966 0.00323 **
## ID 1.024e-04 9.317e-05 1.099 0.27256
## Age -8.879e-03 3.313e-03 -2.680 0.00772 **
## DailyRate -4.430e-05 6.014e-05 -0.737 0.46185
## DistanceFromHome 5.619e-03 2.769e-03 2.029 0.04325 *
## Education 1.966e-02 2.318e-02 0.848 0.39690
## EnvironmentSatisfaction -4.086e-02 1.985e-02 -2.058 0.04036 *
## HourlyRate 2.947e-06 1.138e-03 0.003 0.99794
## JobLevel -9.843e-03 7.594e-02 -0.130 0.89695
## MonthlyIncome -7.731e-06 1.748e-05 -0.442 0.65866
## MonthlyRate -2.318e-06 3.243e-06 -0.715 0.47523
## NumCompaniesWorked 1.814e-02 9.926e-03 1.827 0.06853 .
## PercentSalaryHike 3.602e-03 8.068e-03 0.446 0.65558
## StockOptionLevel -8.701e-02 3.468e-02 -2.509 0.01259 *
## TotalWorkingYears -4.856e-03 6.539e-03 -0.743 0.45819
## TrainingTimesLastYear -5.759e-02 1.884e-02 -3.057 0.00241 **
## YearsAtCompany 9.942e-03 7.958e-03 1.249 0.21239
## YearsInCurrentRole -7.901e-03 1.218e-02 -0.649 0.51703
## YearsSinceLastPromotion 8.550e-03 9.092e-03 0.940 0.34768
## YearsWithCurrManager -1.281e-02 1.177e-02 -1.088 0.27734
## BusinessTravelNum 9.043e-02 3.820e-02 2.367 0.01849 *
## DepartmentNum 1.198e-01 5.128e-02 2.336 0.02007 *
## EducationFieldNum 1.714e-02 1.618e-02 1.059 0.29040
## GenderNum 2.841e-02 4.366e-02 0.651 0.51570
## JobRoleNum -4.013e-03 1.091e-02 -0.368 0.71320
## MaritalStatusNum 4.514e-02 3.825e-02 1.180 0.23876
## OverTimeNum 2.998e-01 4.514e-02 6.642 1.25e-10 ***
## JobInvolvementNum -1.266e-01 2.762e-02 -4.585 6.42e-06 ***
## JobSatisfactionNum -3.829e-02 1.986e-02 -1.928 0.05468 .
## PerformanceRatingNum -3.884e-02 8.050e-02 -0.483 0.62976
## RelationshipSatisfactionNum -6.298e-03 1.889e-02 -0.333 0.73903
## WorkLifeBalanceNum -6.345e-02 3.049e-02 -2.081 0.03820 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3983 on 336 degrees of freedom
## Multiple R-squared: 0.4206, Adjusted R-squared: 0.3672
## F-statistic: 7.87 on 31 and 336 DF, p-value: < 2.2e-16
With Full Model following predictors turned to be statistically significant. Age, DistanceFromHome, EnvironmentSatisfaction, NumCompaniesWorked, StockOptionLevel, TrainingTimesLastYear, BusinessTravelNum, DepartmentNum, MaritalStatusNum, OverTimeNum, JobInvolvementNum, JobSatisfactionNum, WorkLifeBalanceNum
Model1 <-AttritionNum~ Age + DistanceFromHome + EnvironmentSatisfaction + NumCompaniesWorked + StockOptionLevel + TrainingTimesLastYear + BusinessTravelNum + DepartmentNum + MaritalStatusNum + OverTimeNum + JobInvolvementNum + JobSatisfactionNum + WorkLifeBalanceNum
fit1 <- lm(Model1, data = train_sampled_reg)
summary(fit1)
##
## Call:
## lm(formula = Model1, data = train_sampled_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84122 -0.28684 -0.01888 0.26245 1.08348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.122969 0.225203 4.986 9.65e-07 ***
## Age -0.012333 0.002450 -5.034 7.68e-07 ***
## DistanceFromHome 0.006133 0.002712 2.261 0.02435 *
## EnvironmentSatisfaction -0.035423 0.019012 -1.863 0.06327 .
## NumCompaniesWorked 0.013191 0.008369 1.576 0.11586
## StockOptionLevel -0.080171 0.033269 -2.410 0.01647 *
## TrainingTimesLastYear -0.047790 0.017976 -2.659 0.00820 **
## BusinessTravelNum 0.089535 0.037360 2.397 0.01707 *
## DepartmentNum 0.108082 0.041434 2.609 0.00948 **
## MaritalStatusNum 0.045473 0.037300 1.219 0.22361
## OverTimeNum 0.304671 0.043731 6.967 1.59e-11 ***
## JobInvolvementNum -0.133775 0.026956 -4.963 1.08e-06 ***
## JobSatisfactionNum -0.045894 0.018983 -2.418 0.01613 *
## WorkLifeBalanceNum -0.071976 0.029626 -2.430 0.01562 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3977 on 354 degrees of freedom
## Multiple R-squared: 0.3914, Adjusted R-squared: 0.3691
## F-statistic: 17.51 on 13 and 354 DF, p-value: < 2.2e-16
The factors which increases attrition rate in employees are: (because of their positive coefficient in Model) 1) More Business Travel 2) Office Distance From Home 3) Worked in more number of companies 4) OverTime to complete work 5) More number in present company 6) If he is not promoted from long time.
The factors which decreases attrition rate in employees are: (because of their negative coefficient in Model) 1) Higher Age of Employee 2) Good Environment Satisfaction 3) Good Job Involvement 4) Good Job Satisfaction 5) Good Relationship Satisfaction 6) Good Work Life Balance 7) More number of years in Current Role 8) More number of years with Current Manager.
The factors which adversely affect Attrition Rate means to very high extent are: 1) More Business Travel 2) Office Distance From Home 3) Worked in more number of companies 4) Overtime at work 5) Good Environment Satisfaction 6) Good Job Involvement 7) Good Job Satisfaction
set.seed(1234)
splitPerc = .70
trainIndices = sample(1:dim(classification_data)[1],round(splitPerc * dim(classification_data)[1]))
train = classification_data[trainIndices,]
test = classification_data[-trainIndices,]
reg_train <- conv.attr.reg(train)
reg_test <- conv.attr.reg(test)
Model1 <-MonthlyIncome ~ .
fit1 <- lm(Model1, data = reg_train)
summary(fit1)
##
## Call:
## lm(formula = Model1, data = reg_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5703.7 -786.2 8.1 728.8 4583.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.970e+02 8.934e+02 0.332 0.739685
## ID -3.058e-01 2.197e-01 -1.392 0.164581
## Age -4.550e+00 8.432e+00 -0.540 0.589679
## DailyRate 1.049e-01 1.347e-01 0.779 0.436533
## DistanceFromHome -9.438e+00 6.605e+00 -1.429 0.153575
## Education -3.473e+01 5.387e+01 -0.645 0.519411
## EnvironmentSatisfaction -7.812e+01 5.001e+01 -1.562 0.118825
## HourlyRate 2.485e+00 2.736e+00 0.908 0.364017
## JobLevel 3.855e+03 7.921e+01 48.663 < 2e-16 ***
## MonthlyRate -3.306e-03 7.861e-03 -0.421 0.674210
## NumCompaniesWorked -1.281e+01 2.525e+01 -0.507 0.612254
## PercentSalaryHike 6.575e+01 2.350e+01 2.798 0.005318 **
## StockOptionLevel -1.793e+02 8.400e+01 -2.134 0.033243 *
## TotalWorkingYears 6.542e+01 1.597e+01 4.097 4.78e-05 ***
## TrainingTimesLastYear 3.740e+01 4.279e+01 0.874 0.382489
## YearsAtCompany 8.253e+00 2.114e+01 0.390 0.696409
## YearsInCurrentRole -1.778e+01 2.649e+01 -0.671 0.502415
## YearsSinceLastPromotion 2.117e+01 2.250e+01 0.941 0.347036
## YearsWithCurrManager -4.779e+01 2.602e+01 -1.837 0.066764 .
## AttritionNum 2.007e+02 1.731e+02 1.160 0.246598
## BusinessTravelNum -5.248e+01 1.016e+02 -0.516 0.605720
## DepartmentNum -9.042e+02 1.455e+02 -6.214 9.91e-10 ***
## EducationFieldNum -3.453e+01 4.097e+01 -0.843 0.399711
## GenderNum 1.724e+02 1.100e+02 1.567 0.117768
## JobRoleNum 1.240e+02 3.297e+01 3.759 0.000188 ***
## MaritalStatusNum -1.655e+02 1.000e+02 -1.655 0.098432 .
## OverTimeNum -1.505e+02 1.270e+02 -1.185 0.236467
## JobInvolvementNum 1.360e+02 7.699e+01 1.766 0.077947 .
## JobSatisfactionNum 5.880e+01 4.972e+01 1.183 0.237462
## PerformanceRatingNum -4.984e+02 2.355e+02 -2.116 0.034769 *
## RelationshipSatisfactionNum -2.748e+00 4.856e+01 -0.057 0.954884
## WorkLifeBalanceNum -1.807e+01 8.006e+01 -0.226 0.821527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1313 on 577 degrees of freedom
## Multiple R-squared: 0.9264, Adjusted R-squared: 0.9224
## F-statistic: 234.3 on 31 and 577 DF, p-value: < 2.2e-16
lm.pred.full <- predict(fit1, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 1514.196
red_model1_sal <- -MonthlyIncome ~ PerformanceRatingNum + JobInvolvementNum + MaritalStatusNum + JobRoleNum + DepartmentNum + YearsWithCurrManager + TotalWorkingYears + StockOptionLevel + PercentSalaryHike + JobLevel
fit1 <- lm(red_model1_sal, data = reg_train)
summary(fit1)
##
## Call:
## lm(formula = red_model1_sal, data = reg_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4371.7 -736.8 -44.8 785.6 5340.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -138.45 661.04 -0.209 0.834177
## PerformanceRatingNum 525.10 233.35 2.250 0.024791 *
## JobInvolvementNum -119.95 75.10 -1.597 0.110764
## MaritalStatusNum 166.23 97.05 1.713 0.087257 .
## JobRoleNum -116.03 32.30 -3.592 0.000355 ***
## DepartmentNum 857.99 142.99 6.000 3.42e-09 ***
## YearsWithCurrManager 37.74 17.13 2.204 0.027937 *
## TotalWorkingYears -60.51 11.77 -5.141 3.71e-07 ***
## StockOptionLevel 192.72 82.44 2.338 0.019730 *
## PercentSalaryHike -65.89 23.23 -2.836 0.004721 **
## JobLevel -3845.69 77.31 -49.744 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1311 on 598 degrees of freedom
## Multiple R-squared: 0.9239, Adjusted R-squared: 0.9227
## F-statistic: 726.2 on 10 and 598 DF, p-value: < 2.2e-16
lm.pred.full <- predict(fit1, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 14999.37
red_model2_sal <- MonthlyIncome ~ JobRoleNum +
DepartmentNum + TotalWorkingYears +
PercentSalaryHike + JobLevel +
YearsWithCurrManager
fit2 <- lm(red_model2_sal, data = reg_train)
summary(fit2)
##
## Call:
## lm(formula = red_model2_sal, data = reg_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5432.3 -806.7 2.4 765.4 4314.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1036.19 346.50 -2.990 0.002899 **
## JobRoleNum 124.48 32.40 3.842 0.000135 ***
## DepartmentNum -898.33 143.48 -6.261 7.27e-10 ***
## TotalWorkingYears 59.06 11.84 4.988 7.99e-07 ***
## PercentSalaryHike 25.29 14.45 1.751 0.080509 .
## JobLevel 3861.90 77.60 49.766 < 2e-16 ***
## YearsWithCurrManager -40.66 17.21 -2.363 0.018461 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1320 on 602 degrees of freedom
## Multiple R-squared: 0.9224, Adjusted R-squared: 0.9216
## F-statistic: 1192 on 6 and 602 DF, p-value: < 2.2e-16
lm.pred.full <- predict(fit2, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 1446.239
red_model3_sal <- MonthlyIncome ~ JobRoleNum + DepartmentNum + TotalWorkingYears + PercentSalaryHike + JobLevel + YearsAtCompany
fit3 <- lm(red_model3_sal, data = reg_train)
summary(fit3)
##
## Call:
## lm(formula = red_model3_sal, data = reg_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5318.1 -769.4 5.2 774.2 4334.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1043.57 347.82 -3.000 0.00281 **
## JobRoleNum 127.58 32.50 3.925 9.66e-05 ***
## DepartmentNum -924.20 143.43 -6.443 2.39e-10 ***
## TotalWorkingYears 56.89 12.46 4.566 6.04e-06 ***
## PercentSalaryHike 25.00 14.50 1.724 0.08520 .
## JobLevel 3862.84 78.00 49.524 < 2e-16 ***
## YearsAtCompany -13.59 11.76 -1.156 0.24818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1325 on 602 degrees of freedom
## Multiple R-squared: 0.9218, Adjusted R-squared: 0.921
## F-statistic: 1183 on 6 and 602 DF, p-value: < 2.2e-16
lm.pred.full <- predict(fit3, newdata = reg_test[,names(reg_test) != "MonthlyIncome"])
actuals <- reg_test$MonthlyIncome
predictions <- lm.pred.full
RMSE <- sqrt(mean((actuals - predictions)^2))
RMSE
## [1] 1453.472
Conclusion The Reduced Model2 has less RMSE. Lets uses that for salary prediction
str(no_salary_data)
## 'data.frame': 300 obs. of 35 variables:
## $ ID : int 871 872 873 874 875 876 877 878 879 880 ...
## $ Age : int 43 33 55 36 27 39 33 21 30 51 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 2 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 2 3 3 1 3 3 1 2 2 3 ...
## $ DailyRate : int 1422 461 267 1351 1302 895 750 251 1312 1405 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 3 2 2 3 3 2 2 2 ...
## $ DistanceFromHome : int 2 13 13 9 19 5 22 10 23 11 ...
## $ Education : int 4 1 4 4 3 3 2 2 3 2 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 3 2 5 6 3 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1849 995 1372 1949 1619 42 160 1279 159 1367 ...
## $ EnvironmentSatisfaction : int 1 2 1 1 4 4 3 1 1 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 2 2 2 2 1 2 1 ...
## $ HourlyRate : int 92 53 85 66 67 56 95 45 96 82 ...
## $ JobInvolvement : int 3 3 4 4 2 3 3 2 1 2 ...
## $ JobLevel : int 2 1 4 1 1 2 2 1 1 4 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 8 3 3 9 8 3 7 5 ...
## $ JobSatisfaction : int 4 4 3 2 1 4 2 3 3 2 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 3 3 2 1 2 2 3 1 3 ...
## $ MonthlyRate : int 19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
## $ NumCompaniesWorked : int 1 3 6 1 1 3 0 1 1 3 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
## $ PercentSalaryHike : int 20 18 17 22 11 14 13 20 25 16 ...
## $ PerformanceRating : int 4 3 3 4 3 3 3 4 4 3 ...
## $ RelationshipSatisfaction: int 3 1 3 2 1 3 1 3 3 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 0 2 1 1 0 3 0 ...
## $ TotalWorkingYears : int 7 5 24 5 7 19 8 2 10 29 ...
## $ TrainingTimesLastYear : int 5 4 2 3 3 6 2 2 2 1 ...
## $ WorkLifeBalance : int 3 3 2 3 3 4 4 1 2 2 ...
## $ YearsAtCompany : int 7 3 19 5 7 1 7 2 10 5 ...
## $ YearsInCurrentRole : int 7 2 7 4 7 0 7 2 7 2 ...
## $ YearsSinceLastPromotion : int 7 0 3 0 0 0 0 2 0 0 ...
## $ YearsWithCurrManager : int 7 2 8 2 7 0 7 2 9 3 ...
pred_data <- conv.attr.reg(no_salary_data)
lm.model.reduced <- lm(red_model2_sal, data = reg_train)
lm.pred.salary <- predict(lm.model.reduced, newdata = pred_data)
no.salary.prediction <- data.frame(ID = pred_data$ID, MonthlyIncome = round(lm.pred.salary))
path_out2 = '/Users/lijjumathew/Code/Github/MSDS-Doing-Datascience-Project2/Results/'
fileName2 = paste(path_out, 'Case2PredictionsMathewSalary.csv',sep = '')
#write.csv(no.salary.prediction, fileName2, row.names = F)